/**
 * OpenKM, Open Document Management System (http://www.openkm.com)
 * Copyright (c) 2006-2012 Paco Avila & Josep Llort
 * 
 * No bytes were intentionally harmed during the development of this application.
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

package com.openkm.kea.filter;

import java.util.Enumeration;
import java.util.Vector;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.Utils;
import weka.filters.Filter;

/**
 * This filter splits the text in selected string
 * attributes into phrases. The resulting
 * string attributes contain these phrases
 * separated by '\n' characters.
 * 
 * Phrases are identified according to the
 * following definitions:
 * 
 * A phrase is a sequence of words interrupted
 * only by sequences of whitespace characters,
 * where each sequence of whitespace characters
 * contains at most one '\n'.
 * 
 * A word is a sequence of letters or digits
 * (purely numeric sequences also count as
 * words), with the following exceptions:
 * 
 * a) '.', '@', '_', '&', '/', '-' are allowed
 * if surrounded by letters or digits ('.' only
 * when internal periods are not disallowed),
 * 
 * b) '\'' is allowed if preceded by a letter
 * or digit,
 * 
 * c) '-', '/' are also allowed if succeeded by
 * whitespace characters followed by another
 * word. In that case the whitespace characters
 * will be deleted.
 * 
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version 1.0
 */
public class KEAPhraseFilter extends Filter implements OptionHandler {
	
	private static Logger log = LoggerFactory.getLogger(KEAPhraseFilter.class);
	
	/**
	 * 
	 */
	private static final long serialVersionUID = 1L;
	
	/** Stores which columns to select as a funky range */
	protected Range m_SelectCols = new Range();
	
	/** Determines whether internal periods are allowed */
	protected boolean m_DisallowInternalPeriods = false;
	
	/**
	 * Returns a string describing this filter
	 * 
	 * @return a description of the filter suitable for
	 *         displaying in the explorer/experimenter gui
	 */
	public String globalInfo() {
		return "This filter splits the text contained " + "by the selected string attributes into phrases.";
	}
	
	/**
	 * Returns an enumeration describing the available options
	 * 
	 * @return an enumeration of all the available options
	 */
	public Enumeration<Option> listOptions() {
		Vector<Option> newVector = new Vector<Option>(3);
		
		newVector.addElement(new Option("\tSpecify list of attributes to process. First and last are valid\n"
				+ "\tindexes. (default none)", "R", 1, "-R <index1,index2-index4,...>"));
		newVector.addElement(new Option("\tInvert matching sense", "V", 0, "-V"));
		newVector.addElement(new Option("\tDisallow internal periods", "P", 0, "-P"));
		
		return newVector.elements();
	}
	
	/**
	 * Returns the Capabilities of this filter.
	 * 
	 * @return the capabilities of this object
	 * @see Capabilities
	 */
	public Capabilities getCapabilities() {
		Capabilities result = super.getCapabilities();
		
		// attributes
		result.enableAllAttributes();
		result.enable(Capability.MISSING_VALUES);
		
		// class
		result.enable(Capability.NOMINAL_CLASS);
		result.enable(Capability.NO_CLASS);
		
		return result;
	}
	
	/**
	 * Parses a given list of options controlling the behaviour of this object.
	 * Valid options are:
	 * <p>
	 * 
	 * -R index1,index2-index4,...<br>
	 * Specify list of attributes to process. First and last are valid indexes. (default none)
	 * <p>
	 * 
	 * -V<br>
	 * Invert matching sense
	 * <p>
	 * 
	 * -P<br>
	 * Disallow internal periods
	 * <p>
	 * 
	 * @param options the list of options as an array of strings
	 * @exception Exception if an option is not supported
	 */
	public void setOptions(String[] options) throws Exception {
		String list = Utils.getOption('R', options);
		
		if (list.length() != 0) {
			setAttributeIndices(list);
		}
		
		setInvertSelection(Utils.getFlag('V', options));
		setDisallowInternalPeriods(Utils.getFlag('P', options));
		
		if (getInputFormat() != null) {
			setInputFormat(getInputFormat());
		}
	}
	
	/**
	 * Gets the current settings of the filter.
	 * 
	 * @return an array of strings suitable for passing to setOptions
	 */
	public String[] getOptions() {
		String[] options = new String[4];
		int current = 0;
		
		if (getInvertSelection()) {
			options[current++] = "-V";
		}
		
		if (getDisallowInternalPeriods()) {
			options[current++] = "-P";
		}
		
		if (!getAttributeIndices().equals("")) {
			options[current++] = "-R";
			options[current++] = getAttributeIndices();
		}
		
		while (current < options.length) {
			options[current++] = "";
		}
		
		return options;
	}
	
	/**
	 * Sets the format of the input instances.
	 * 
	 * @param instanceInfo an Instances object containing the input
	 *        instance structure (any instances contained in the object are
	 *        ignored - only the structure is required).
	 * @return true if the outputFormat may be collected immediately
	 */
	public boolean setInputFormat(Instances instanceInfo) throws Exception {
		super.setInputFormat(instanceInfo);
		setOutputFormat(instanceInfo);
		m_SelectCols.setUpper(instanceInfo.numAttributes() - 1);
		
		return true;
	}
	
	/**
	 * Input an instance for filtering. Ordinarily the instance is processed
	 * and made available for output immediately. Some filters require all
	 * instances be read before producing output.
	 * 
	 * @param instance the input instance
	 * @return true if the filtered instance may now be
	 *         collected with output().
	 * @exception Exception if the input instance was not of the correct
	 *            format or if there was a problem with the filtering.
	 */
	public boolean input(Instance instance) throws Exception {
		if (getInputFormat() == null) {
			throw new Exception("No input instance format defined");
		}
		
		if (m_NewBatch) {
			resetQueue();
			m_NewBatch = false;
		}
		
		convertInstance(instance);
		return true;
	}
	
	/**
	 * Signify that this batch of input to the filter is finished. If
	 * the filter requires all instances prior to filtering, output()
	 * may now be called to retrieve the filtered instances. Any
	 * subsequent instances filtered should be filtered based on setting
	 * obtained from the first batch (unless the inputFormat has been
	 * re-assigned or new options have been set). This default
	 * implementation assumes all instance processing occurs during
	 * inputFormat() and input().
	 * 
	 * @return true if there are instances pending output
	 * @exception NullPointerException if no input structure has been defined,
	 * @exception Exception if there was a problem finishing the batch.
	 */
	public boolean batchFinished() throws Exception {
		if (getInputFormat() == null) {
			throw new NullPointerException("No input instance format defined");
		}
		
		m_NewBatch = true;
		return (numPendingOutput() != 0);
	}
	
	/**
	 * Main method for testing this class.
	 * 
	 * @param argv should contain arguments to the filter: use -h for help
	 */
	public static void main(String[] argv) {
		try {
			if (Utils.getFlag('b', argv)) {
				Filter.batchFilterFile(new KEAPhraseFilter(), argv);
			} else {
				Filter.filterFile(new KEAPhraseFilter(), argv);
			}
		} catch (Exception ex) {
			log.error(ex.getMessage(), ex);
		}
	}
	
	/**
	 * Converts an instance by removing all non-alphanumeric characters
	 * from its string attribute values.
	 */
	private void convertInstance(Instance instance) throws Exception {
		double[] instVals = new double[instance.numAttributes()];
		
		for (int i = 0; i < instance.numAttributes(); i++) {
			if (!instance.attribute(i).isString() || instance.isMissing(i)) {
				instVals[i] = instance.value(i);
			} else {
				if (!m_SelectCols.isInRange(i)) {
					int index = getOutputFormat().attribute(i).addStringValue(instance.stringValue(i));
					instVals[i] = (double) index;
					continue;
				}
				
				// aly: str = text of the document
				String str = instance.stringValue(i);
				
				String tokenized = tokenize(str);
				
				// aly: resultStr is the clean version of str
				// log.info(resultStr.toString());
				int index = getOutputFormat().attribute(i).addStringValue(tokenized);
				instVals[i] = (double) index;
			}
		}
		
		Instance inst = new Instance(instance.weight(), instVals);
		inst.setDataset(getOutputFormat());
		push(inst);
	}
	
	public String tokenize(String str) {
		StringBuffer resultStr = new StringBuffer();
		int j = 0;
		boolean phraseStart = true;
		boolean seenNewLine = false;
		boolean haveSeenHyphen = false;
		boolean haveSeenSlash = false;
		
		while (j < str.length()) {
			boolean isWord = false;
			boolean potNumber = false;
			int startj = j;
			while (j < str.length()) {
				char ch = str.charAt(j);
				if (Character.isLetterOrDigit(ch)) {
					potNumber = true;
					isWord = true;
					// aly: allowing digits as words
					/*
					 * if (Character.isLetter(ch)) {
					 * isWord = true;
					 * }
					 */
					j++;
				} else if ((!m_DisallowInternalPeriods && (ch == '.')) || (ch == '@') || (ch == '_') || (ch == '&')
						|| (ch == '/') || (ch == '-')) {
					if ((j > 0) && (j + 1 < str.length()) && Character.isLetterOrDigit(str.charAt(j - 1))
							&& Character.isLetterOrDigit(str.charAt(j + 1))) {
						j++;
					} else {
						break;
					}
				} else if (ch == '\'') {
					if ((j > 0) && Character.isLetterOrDigit(str.charAt(j - 1))) {
						j++;
					} else {
						break;
					}
				} else {
					break;
				}
			}
			if (isWord == true) {
				if (!phraseStart) {
					if (haveSeenHyphen) {
						resultStr.append('-');
					} else if (haveSeenSlash) {
						resultStr.append('/');
					} else {
						resultStr.append(' ');
					}
				}
				
				resultStr.append(str.substring(startj, j));
				
				if (j == str.length()) {
					break;
				}
				
				phraseStart = false;
				seenNewLine = false;
				haveSeenHyphen = false;
				haveSeenSlash = false;
				
				if (Character.isWhitespace(str.charAt(j))) {
					if (str.charAt(j) == '\n') {
						seenNewLine = true;
					}
				} else if (str.charAt(j) == '-') {
					haveSeenHyphen = true;
				} else if (str.charAt(j) == '/') {
					haveSeenSlash = true;
				} else {
					phraseStart = true;
					resultStr.append('\n');
				}
				j++;
			} else if (j == str.length()) {
				break;
			} else if (str.charAt(j) == '\n') {
				if (seenNewLine) {
					if (phraseStart == false) {
						resultStr.append('\n');
						phraseStart = true;
					}
				} else if (potNumber) {
					if (phraseStart == false) {
						phraseStart = true;
						resultStr.append('\n');
					}
				}
				seenNewLine = true;
				j++;
			} else if (Character.isWhitespace(str.charAt(j))) {
				if (potNumber) {
					if (phraseStart == false) {
						phraseStart = true;
						resultStr.append('\n');
					}
				}
				j++;
			} else {
				if (phraseStart == false) {
					resultStr.append('\n');
					phraseStart = true;
				}
				j++;
			}
		}
		
		return resultStr.toString();
	}
	
	/**
	 * Returns the tip text for this property
	 * 
	 * @return tip text for this property suitable for
	 *         displaying in the explorer/experimenter gui
	 */
	public String invertSelectionTipText() {
		return "If set to false, the specified attributes will be processed;"
				+ " If set to true, specified attributes won't be processed.";
	}
	
	/**
	 * Get whether the supplied columns are to be processed
	 * 
	 * @return true if the supplied columns won't be processed
	 */
	public boolean getInvertSelection() {
		return m_SelectCols.getInvert();
	}
	
	/**
	 * Set whether selected columns should be processed. If true the
	 * selected columns won't be processed.
	 * 
	 * @param invert the new invert setting
	 */
	public void setInvertSelection(boolean invert) {
		m_SelectCols.setInvert(invert);
	}
	
	/**
	 * Returns the tip text for this property
	 * 
	 * @return tip text for this property suitable for
	 *         displaying in the explorer/experimenter gui
	 */
	public String disallowInternalPeriodsTipText() {
		return "If set to false, internal periods are allowed.";
	}
	
	/**
	 * Get whether the supplied columns are to be processed
	 * 
	 * @return true if the supplied columns won't be processed
	 */
	public boolean getDisallowInternalPeriods() {
		return m_DisallowInternalPeriods;
	}
	
	/**
	 * Set whether selected columns should be processed. If true the
	 * selected columns won't be processed.
	 * 
	 * @param disallow the new invert setting
	 */
	public void setDisallowInternalPeriods(boolean disallow) {
		m_DisallowInternalPeriods = disallow;
	}
	
	/**
	 * Returns the tip text for this property
	 * 
	 * @return tip text for this property suitable for
	 *         displaying in the explorer/experimenter gui
	 */
	public String attributeIndicesTipText() {
		return "Specify range of attributes to act on." + " This is a comma separated list of attribute indices, with"
				+ " \"first\" and \"last\" valid values. Specify an inclusive"
				+ " range with \"-\". E.g: \"first-3,5,6-10,last\".";
	}
	
	/**
	 * Get the current range selection.
	 * 
	 * @return a string containing a comma separated list of ranges
	 */
	public String getAttributeIndices() {
		return m_SelectCols.getRanges();
	}
	
	/**
	 * Set which attributes are to be processed
	 * 
	 * @param rangeList a string representing the list of attributes. Since
	 *        the string will typically come from a user, attributes are indexed from
	 *        1. <br>
	 *        eg: first-3,5,6-last
	 */
	public void setAttributeIndices(String rangeList) {
		m_SelectCols.setRanges(rangeList);
	}
	
	/**
	 * Set which attributes are to be processed
	 * 
	 * @param attributes an array containing indexes of attributes to select.
	 *        Since the array will typically come from a program, attributes are indexed
	 *        from 0.
	 */
	public void setAttributeIndicesArray(int[] attributes) {
		setAttributeIndices(Range.indicesToRangeList(attributes));
	}
}
